import glob
import os
import re
import sys
from collections import Counter
# Result file names look like "<gene>.<anything>.vs.<sample>.txt".
_RESULT_NAME = re.compile(r'^(.*)\..*\.vs\.(.*)\.txt')

# A hit line does not start with '#' and has at least two tabs after its
# first character; the species is whatever follows the last tab on the line.
_HIT_LINE = re.compile(r'[^#].*\t.*\t(.*)')


def _count_species(lines):
    """Return a Counter mapping species name to its number of hit lines.

    Only lines inside a capture region are counted.  Capture starts switched
    off and is toggled by every line that begins with '#fragment', so a
    region runs from one '#fragment' line to the next.  Hit lines whose last
    tab-separated field is empty are ignored.
    """
    counts = Counter()
    capturing = False
    for line in lines:
        if line.startswith('#fragment'):
            # The marker line itself is never a hit (it starts with '#').
            capturing = not capturing
            continue
        if not capturing:
            continue
        hit = _HIT_LINE.match(line)
        if hit and hit.group(1):
            counts[hit.group(1)] += 1
    return counts


def main(input_glob="/data/gos/all.genes.gos.results/*.txt",
         output_path='/data/gos/all.genes.gos.results/abund/all.test.txt'):
    '''Parses the files and pulls out the species hits and counts into a four-column file

    input_glob  -- glob pattern selecting the result files to read.
    output_path -- file to (over)write with the tab-separated table
                   "gene, sample, species, count" (preceded by a header row).

    The gene and sample names are taken from each input file's base name,
    which must look like "<gene>.<anything>.vs.<sample>.txt".  Files whose
    names do not fit are skipped with a warning on stderr instead of aborting
    the whole run.  Input files are processed in sorted order so the output
    is reproducible across runs and file systems.
    '''
    # Line-buffered (buffering=1) so rows reach disk as they are produced.
    with open(output_path, 'w', 1) as out:
        out.write('gene\tsample\tspecies\tcount\n')
        for path in sorted(glob.glob(input_glob)):
            name = _RESULT_NAME.match(os.path.basename(path))
            if name is None:
                sys.stderr.write(
                    'Skipping %s: file name is not '
                    '<gene>.<x>.vs.<sample>.txt\n' % path)
                continue
            gene, sample = name.group(1), name.group(2)

            with open(path, 'r', 1) as handler:
                species_count = _count_species(handler)

            for species, count in species_count.items():
                out.write('%s\t%s\t%s\t%d\n' % (gene, sample, species, count))
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()